athletes_events <- read_csv("data/athletes_and_events.csv", show_col_types = FALSE)
noc_regions <- read_csv("data/noc_regions.csv", show_col_types = FALSE)
gdp_pop <- read_csv("data/gdp_pop.csv", show_col_types = FALSE)
names(gdp_pop) <- make.names(names(gdp_pop))
athlete_region <- athletes_events %>%
left_join(noc_regions, by = "NOC") %>%
left_join(gdp_pop, by = c("NOC" = "Code"))
#identify the NOC regions where there are more than one and overwrite the names
noc_change <- athlete_region %>%
select(NOC, region) %>%
unique() %>%
group_by(region) %>%
count() %>%
filter(n > 1)
#alter some of the codes
athlete_region <- athlete_region %>%
mutate(new_NOC = case_when(region == "Australia" ~ "AUS",
region == "Canada" ~ "CAN",
region == "China" ~ "CHN",
region == "Czech Republic" ~ "CZE",
region == "Germany" ~ "GER",
region == "Greece" ~ "GRE",
region == "Malaysia" ~ "MAL",
region == "Russia" ~ "RUS",
region == "Serbia" ~ "SRB",
region == "Syria" ~ "SYR",
region == "Trinidad" ~ "TTO",
region == "Vietnam" ~ "VIE",
region == "Yemen" ~ "YEM",
region == "Zimbabwe" ~ "ZIM",
)) %>%
mutate(NOC = ifelse(is.na(new_NOC) == TRUE, NOC, new_NOC)) #replace the value
winter_game_count <- athlete_region %>%
filter(Season == "Winter") %>%
group_by(NOC) %>%
summarize(winter_count = n_distinct(Year))
winter_game_medals <- athlete_region %>%
filter(Season == "Winter", is.na(Medal) == FALSE) %>%
select(NOC, region, Medal, Event) %>%
unique() %>% #allocate one medal per event (so team events dont have duplicates)
mutate(count_column = 1) %>%
group_by(NOC, region, Medal) %>%
summarize(medal_count = sum(count_column))
Visual comparison of medal count by country:
#combine the 2 data frames to view the medal count and winter games count for all countries
top_medals <- winter_game_count %>%
inner_join(winter_game_medals, by = "NOC") %>%
pivot_wider(names_from = Medal, values_from = medal_count, values_fill = 0) %>%
rename("Winter_Games_Count" = "winter_count") %>%
group_by(region) %>%
mutate(Total_Medals = sum(Bronze, Gold, Silver)) %>%
arrange(desc(Total_Medals)) %>%
select(NOC, region, Winter_Games_Count, Gold, Silver, Bronze, Total_Medals)
pretty_headers <-
gsub("[_]", " ", colnames(top_medals)) %>%
str_to_title()
datatable(top_medals,
filter = list(position = "top"),
options = list(language = list(sSearch = "Filter:")),
colnames = pretty_headers) %>%
formatStyle('Winter_Games_Count',
background = styleColorBar(range(top_medals$Winter_Games_Count),'lightblue'),
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center') %>%
formatStyle('Total_Medals',
background = styleColorBar(range(top_medals$Total_Medals),'pink'),
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center') %>%
formatStyle('Gold', backgroundColor = 'gold') %>%
formatStyle('Silver', backgroundColor = 'lightgrey') %>%
formatStyle('Bronze', backgroundColor = '#CD9600')
winter_game_medals %>%
group_by(NOC) %>%
mutate(total_medals = sum(medal_count)) %>%
filter(total_medals > 80) %>%
mutate(Medal = recode_factor(Medal, 'Gold' = "Gold", 'Silver' = "Silver", 'Bronze' = "Bronze")) %>%
mutate(region = case_when(region == "USA" ~ "United States",
TRUE ~ region)) %>%
ggplot(aes(x = reorder(region, -total_medals), y = medal_count, label = medal_count)) +
geom_area(aes(group = Medal, fill = Medal)) +
geom_line(aes(group = Medal), position = "stack", color = "white") +
geom_text(aes(label = medal_count), position = position_stack(vjust = 0.5), size = 3) +
scale_fill_manual(values=c("gold", "lightgrey", "#CD9600")) +
theme_bw() +
labs(x = "Country",
y = "Medal Count",
title = "Countries with over 100 Lifetime Winter Olympic Medals",
caption = "International Olympic Committee (2016)")
#prep data for chart
medals_over_time <- athlete_region %>%
filter(Season == "Winter",
is.na(Medal) == FALSE) %>%
select(NOC, region, Year, Medal, Event) %>%
unique() %>%
mutate(medal_count = 1) %>%
group_by(NOC, Year, region, Medal) %>%
summarize(total_medals = sum(medal_count)) %>%
pivot_wider(names_from = Medal, values_from = total_medals, values_fill = 0)
medals_over_time %>%
group_by(NOC) %>%
mutate(total_medals = sum(Gold, Bronze, Silver)) %>%
filter(total_medals > 160) %>%
ungroup() %>%
group_by(NOC, Year) %>%
mutate(games_medals = sum(Gold, Bronze, Silver)) %>%
ungroup() %>%
arrange(NOC, Year) %>%
group_by(NOC) %>%
mutate(cumulative_medals = cumsum(games_medals)) %>%
ggplot(aes(x = Year, y = cumulative_medals)) +
geom_textline(aes(color = NOC, group = NOC, label = region, size = 0.5, hjust = 0.95)) +
labs(x = "Year",
y = "All-Time Medal Count",
title = "Cumulative Winter Olympic Medal Counts: \n Germany and Russia Race Ahead in All-Time Medals",
caption = "International Olympic Committee (2016)") +
theme_bw() +
theme(legend.position = "none")
Commentary:
While the first chart depicting the top countries’ lifetime Winter Olympic medals tells a compelling story of their respective gold, silver, and bronze medal breakdown, I prefer and would recommend the second visualization that shows the total Olympic medal count over time. It tells a more engaging story in terms of how top Winter Olympic countries have jostled for the top spot, and it offers a big-picture perspective on medal counts that is not isolated to a single Winter Olympic contest, and instead, shows a pattern over all Olympics (a perspective we don’t often see). For instance, we can see Germany and Russia surpass Norway around the late 1980’s. In addition, while the US likes to think of itself as a consistent leader in the Olympic Games, it’s clear from ths second visualization that it ranks fourth in overall medals.
I used total medals as my measure of success.
library(dplyr)
athlete_region %>%
filter(Season == "Winter", is.na(Medal) == FALSE, is.na(GDP.per.Capita) == FALSE) %>%
select(NOC, region, Event, GDP.per.Capita) %>%
group_by(NOC, region, Event, GDP.per.Capita) %>%
unique() %>%
count() %>%
ungroup() %>%
group_by(NOC, region, GDP.per.Capita) %>%
summarize(medals_per_GDP = sum(n)/GDP.per.Capita) %>%
unique() %>%
filter(medals_per_GDP > 0.001) %>%
ungroup() %>%
mutate(index_gdp = GDP.per.Capita / mean(GDP.per.Capita),
to_highlight = case_when(NOC == "RUS" ~ "RUS",
NOC == "USA" ~ "USA",
TRUE ~ "other")) %>%
ggplot(aes(x = index_gdp, y = medals_per_GDP, label = region, color = to_highlight)) +
geom_point() +
geom_text_repel() +
scale_color_manual(values = c("RUS"="red", "USA"="blue", "other" = "grey"), guide = FALSE) +
theme_bw() +
theme(legend.position = "none") +
labs(x = "Indexed GDP",
y = "Medals per Unit of GDP per Capita",
title = "A Different Cold War:\nRussia Produces Many More Medals per Unit GDP Than US",
caption = "International Olympic Committee (2016)")
##CREATED IN GGPLOT
athlete_region %>%
filter(Season == "Winter",
is.na(Medal) == FALSE,
is.na(Population) == FALSE) %>%
select(NOC, Medal, region, Event, Population) %>%
group_by(NOC, Medal, region, Event, Population) %>%
unique() %>%
count() %>%
ungroup() %>%
group_by(NOC, region, Population) %>%
summarize(total_medals = sum(n)) %>%
ungroup() %>%
mutate(medals_per_pop = total_medals / Population) %>%
arrange(desc(medals_per_pop)) %>%
slice_head(n = 15) %>%
mutate(to_highlight = as.factor(ifelse(NOC == "LIE", "yes", "no"))) %>%
ggplot(aes(x = Population, y = medals_per_pop, label = region, color = to_highlight)) +
geom_point() +
geom_text_repel() +
scale_y_continuous(trans='log') +
scale_x_continuous(trans='log', labels = scales::comma) +
scale_color_manual(values = c("yes"="blue", "no"="darkgray" ), guide = FALSE) +
theme_bw() +
theme(legend.position = "none") +
labs(x = "Population (Log Transformed)",
y = "Total Medals per Person (Log Transformed)",
title = "Total Winter Olympic Medals per Citizen: \nLiechtenstein Shines in Most Medals per Citizen",
caption = "International Olympic Committee (2016)")
##CREATED IN PLOTLY
athlete_region %>%
filter(Season == "Winter",
is.na(Medal) == FALSE,
is.na(Population) == FALSE) %>%
select(NOC, Medal, region, Event, Population) %>%
group_by(NOC, Medal, region, Event, Population) %>%
unique() %>%
count() %>%
ungroup() %>%
group_by(NOC, region, Population) %>%
summarize(total_medals = sum(n)) %>%
ungroup() %>%
mutate(medals_per_pop = total_medals / Population) %>%
arrange(desc(medals_per_pop)) %>%
slice_head(n = 15) %>%
mutate(to_highlight = as.factor(ifelse(NOC == "LIE", "yes", "no"))) %>%
plot_ly(x = ~Population,
y = ~medals_per_pop,
color = ~region,
type = "scatter",
mode = "markers",
hoverinfo = 'text',
text = ~paste("Country:", region, "</br></br> Population (actual, untransformed):", scales::comma(Population),
'</br> Medals per Citizen (actual, untransformed):',
round(medals_per_pop,6))) %>%
layout(showlegend = T,
title = "Total Winter Olympic Medals per Citizen: \nLiechtenstein Shines in Most Medals per Citizen",
yaxis = list(title = "Total Medals per Person (Log Transformed)", type = "log"),
xaxis = list(title = "Population (Log Transformed)", type = "log"))
# add_annotations(text = ~region, xanchor = "left", showarrow = F)
Plotly Justification This plotly chart’s interactivity is helpful for a reader because they can obtain specific data (like population and “medal count per person”) and the country name in one pass by hovering over the data points of interest. If they want specific numbers, it’s easy to obtain the specific population rather than a rough estimate based on the axes’ tick marks. Third, the interactivity makes the chart’s appearance cleaner, which is particularly salient since some points are very close together; after all, static labels would make the chart look cluttered.
Step 1. Extract Host Country Data and clean up the data pull
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
host_countries <- hosts %>%
filter(Winter != "") %>%
select(City, Country, Year) %>%
rename(Host_Country = Country) %>%
mutate(Host_Country = gsub("United States", "USA", Host_Country)) %>%
mutate(Host_Country = gsub("Yugoslavia", "Serbia", Host_Country))
#fix the one error in the data
host_countries[5, 1] <- "Sapporo"
host_countries[5, 2] <- "Japan"
host_countries[24, 2] <- "Russia"
host_countries[14, 1] <- "Innsbruck"
Step 2. Calculate the average medals for host countries when they compete at home vs abroad.
#average home country medals
home_country_adv <- athlete_region %>%
inner_join(host_countries) %>%
filter(Season == "Winter",
region == Host_Country) %>%
group_by(Event, Games, region) %>%
unique() %>%
ungroup() %>%
filter(is.na(Medal) == FALSE) %>%
group_by(region, Games) %>%
count() %>%
ungroup() %>%
group_by(region) %>%
summarize(avg_hc_medals_adv = round(mean(n), 0))
#non-home-country-medals
no_hc_adv <- athlete_region %>%
filter(Season == "Winter") %>%
left_join(host_countries) %>%
group_by(Event, Games, region) %>%
unique() %>%
ungroup() %>%
filter(is.na(Medal) == FALSE,
region != Host_Country,
region %in% home_country_adv$region) %>%
group_by(region, Games) %>%
count() %>%
ungroup() %>%
group_by(region) %>%
summarize(avg_hc_medals_noadv = round(mean(n),0))
Step 3. Merge data sets and visualize.
no_hc_adv %>%
left_join(home_country_adv) %>%
rename(avg_hc_medals_1 = avg_hc_medals_noadv,
avg_hc_medals_2 = avg_hc_medals_adv) %>%
mutate(difference = avg_hc_medals_2 - avg_hc_medals_1) %>%
pivot_longer(cols = c("avg_hc_medals_1", "avg_hc_medals_2"),
names_to = "status",
values_to = "medals",
values_drop_na = FALSE) %>%
mutate(region = factor(region, levels =
c("Russia", "Canada", "USA", "Norway", "France",
"Japan", "Austria", "Italy", "Serbia",
"Germany"))) %>%
ggplot(aes(x = status, y = medals, group = region, fill = status, label = medals)) +
scale_fill_manual(values = c("avg_hc_medals_1"="darkgray", "avg_hc_medals_2"="tomato"),
labels = c("No Host Country Advantage", "Host Country Advantage")) +
geom_bar(stat = "identity") +
geom_point() +
geom_line(group = 1) +
geom_text(size = 3, vjust = 2.2, color = "white") +
facet_grid(~region) +
theme_bw() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
labs(y = "Avg Medal Count per Winter Olympic Games",
title = "Host Country Advantage: Canada & Russia Make the Most out of Hosting",
caption = "International Olympic Committee (2016)") +
theme(legend.position = "bottom",
legend.title=element_blank())
athlete_region %>%
filter(Season == "Winter",
is.na(Medal) == FALSE) %>%
group_by(Name, region) %>%
count(Medal) %>%
mutate(total_medals = sum(n)) %>%
arrange(desc(total_medals)) %>%
filter(total_medals > 8) %>%
mutate(Medal = recode_factor(Medal, 'Gold' = "Gold", 'Silver' = "Silver", 'Bronze' = "Bronze")) %>%
ggplot(aes(x = reorder(paste0(Name, " (", region,")"), -total_medals), y = n, fill = Medal, label = n)) +
geom_col(stat = "identity") +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
scale_fill_manual(values=c("gold", "lightgrey", "#CD9600")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 9)) +
labs(x = "Winter Athlete",
y = "Medal Count",
title = "Medal Counts of Most Successful Winter Athletes (1896 - 2016)",
caption = "International Olympic Committee (2016)") +
theme(legend.position="bottom")
##GGPLOT VERSION OF TOP ATHLETES VS. WEIGHT
athlete_region %>%
filter(Season == "Winter",
is.na(Medal) == FALSE) %>%
group_by(Name, region, Sex, Weight) %>%
count(Medal) %>%
summarize(total_medals = sum(n)) %>%
arrange(desc(total_medals)) %>%
ungroup() %>%
ggplot(mapping = aes(x = as.factor(reorder(total_medals, total_medals)), y = Weight)) +
geom_boxplot(fill = "lightblue") +
geom_smooth(method = "loess", se=FALSE, aes(group=1), color = "violet") +
scale_x_discrete(limits = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)) +
theme_bw() +
theme(legend.position = "none") +
labs(x = "Individual Athlete's Winter Olympic Medal Count (1896 - 2016)",
y = "Individual Athlete's Weight (kg)",
title = "Tighter and Lower Weight Ranges Among Top Winter Olympic Athletes",
caption = "International Olympic Committee (2016)")
##PLOTLY VERSION OF TOP ATHLETES VS. WEIGHT
athlete_region %>%
filter(Season == "Winter",
is.na(Medal) == FALSE) %>%
group_by(Name, region, Sex, Weight) %>%
count(Medal) %>%
summarize(total_medals = sum(n)) %>%
arrange(desc(total_medals)) %>%
ungroup() %>%
plot_ly(x = ~total_medals, y = ~Weight, color = ~Sex, type = "box") %>%
layout(boxmode = "group",
title = "Tighter and Lower Weight Ranges Among Top Winter Olympic Athletes",
yaxis = list(title = "Individual Athlete's Weight (kg)", showticklabels = TRUE),
xaxis = list(title = "Individual Athlete's Winter Olympic Medal Count (1896 - 2016)", dtick=1),
hoverlabel = list(bgcolor = "white"))
Plotly Justification The interactive chart above is particularly helpful, because it presents detailed information in an aesthetically-pleasing manner. Instead of details like the mean, quartiles, and min/max values cluttering a static chart, the interactivity presents the data in a clean manner and invites the user to hover over the box plot for richer detail, should they be curious.
Data Prep
gamesattended <- athlete_region %>%
filter(Season == "Winter") %>%
group_by(Name, region, NOC) %>%
count() %>%
summarize(games_attended = sum(n)) %>%
arrange(desc(games_attended))
wins <- athlete_region %>%
filter(Season == "Winter",
is.na(Medal) == FALSE) %>%
group_by(Name, region, NOC) %>%
summarize(total_medals = n())
unique_events <- athlete_region %>%
filter(Season == "Winter") %>%
select(Name, NOC, region, Event) %>%
unique() %>%
group_by(Name, NOC, region) %>%
summarize(compete_events = n())
final_df <- gamesattended %>%
left_join(unique_events) %>%
left_join(wins) %>%
mutate(total_medals = ifelse(is.na(total_medals) == TRUE, 0, total_medals)) %>%
mutate(winning_percentage = round(total_medals / games_attended,2)) %>%
arrange(desc(games_attended)) %>%
rename(winter_games_attended = games_attended,
Unique_Competitive_Events = compete_events,
Country = region) %>%
select(-NOC)
I propose a data table that shows athletes’ Winter Olympics event participation along with their winning percentage (in terms of how many medals they’ve won compared to the events they’ve competed in). It will be interesting for readers to see who are the most prolific athletes, the number of unique events that athletes have participated in, and who are the rising stars by country.
pretty_headers2 <-
gsub("[_]", " ", colnames(final_df)) %>%
str_to_title()
datatable(final_df,
filter = list(position = "top"),
options = list(language = list(sSearch = "Filter:")),
colnames = pretty_headers2,
rownames = FALSE) %>%
formatStyle("winning_percentage",
background = styleColorBar(range(final_df$winning_percentage),"gold"),
backgroundRepeat = "no-repeat",
backgroundPosition = "center") %>%
formatStyle("total_medals",
background = styleColorBar(range(final_df$total_medals),"violet"),
backgroundRepeat = "no-repeat",
backgroundPosition = "center") %>%
formatStyle("winter_games_attended",
background = styleColorBar(range(final_df$winter_games_attended),"lightblue"),
backgroundRepeat = "no-repeat",
backgroundPosition = "center") %>%
formatStyle("Unique_Competitive_Events",
background = styleColorBar(range(final_df$Unique_Competitive_Events),"lightgreen"),
backgroundRepeat = "no-repeat",
backgroundPosition = "center") %>%
formatStyle("Country", backgroundColor = "black", color = "white") %>%
formatStyle("Name", fontWeight = "bold")